`— title: “STA 222 Final Project” output: html_document date: “2022-12-03” authors: Kate Jones, Gianni Spiga, Yichu Chen, Gu Gong —

Loading Libraries

library(survival)
library(KMsurv)
library(ggplot2)
library(ggpubr)
library(survminer)
## 
## Attaching package: 'survminer'
## The following object is masked from 'package:survival':
## 
##     myeloma
library(muhaz)
library(plyr)
## 
## Attaching package: 'plyr'
## The following object is masked from 'package:ggpubr':
## 
##     mutate
library(tidyr)
library(plotly)
## 
## Attaching package: 'plotly'
## The following objects are masked from 'package:plyr':
## 
##     arrange, mutate, rename, summarise
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(ggthemes)
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:plotly':
## 
##     select
library(gridExtra)

Cleaning Data

# Loading in data
data(std)
std0 = std
# Checking for NA Values in data set 
any(is.na(std))
## [1] FALSE
factor.cols <-
  c(
    "race",
    # Race (W=white, B=black)
    "marital",
    # Marital status (D=divorced / separated, M=married, S=single)
    "iinfct",
    # Initial infection (1= gonorrhea, 2=chlamydia, 3=both)
    "os12m",
    # Oral sex within 12 months (1=yes, 0=no)
    "os30d",
    # Oral sex within 30 days (1=yes, 0=no)
    "rs12m",
    # Rectal sex within 12 months (1=yes, 0=no)
    "rs30d",
    # Rectal sex within 30 days (1=yes, 0=no)
    "abdpain",
    # Presence of abdominal pain (1=yes, 0=no)
    "discharge",
    # Sign of discharge (1=yes, 0=no)
    "dysuria",
    # Sign of dysuria (1=yes, 0=no)
    "condom",
    # Condom use (1=always, 2=sometime, 3=never)
    "itch",
    # Sign of itch (1=yes, 0=no)
    "lesion",
    # Sign of lesion (1=yes, 0=no)
    "rash",
    # Sign of rash (1=yes, 0=no)
    "lymph",
    # Sign of lymph (1=yes, 0=no)
    "vagina",
    # Involvement vagina at exam (1=yes, 0=no)
    "dchexam",
    # Discharge at exam (1=yes, 0=no)
    "abnode" # Abnormal node at exam (1=yes, 0=no)
  )
std[factor.cols] <- lapply(std[factor.cols], as.factor)

std$race <-
  mapvalues(std$race,
            from = c("W", "B"),
            to = c("White", "Black"))
std$marital <-
  mapvalues(
    std$marital,
    from = c("D", "M", "S"),
    to = c("Divorced/Separated", "Married", "Single")
  )
std$iinfct <-
  mapvalues(
    std$iinfct,
    from = c("1", "2", "3"),
    to = c("gonorrhea", "chlamydia", "both")
  )
std$condom <-
  mapvalues(
    std$condom,
    from = c("1", "2", "3"),
    to = c("always", "sometime", "never")
  )

Exploratory Data Analysis

Checking Model Assumptions

# Building the Survival Object 
infect <- std$iinfct
surv_object <- Surv(time = std$time, event = std$rinfct)
## Call:
## survdiff(formula = surv_object ~ iinfct, data = std)
## 
##                    N Observed Expected (O-E)^2/E (O-E)^2/V
## iinfct=gonorrhea 140       73     54.5   6.28042   7.50617
## iinfct=chlamydia 396      135    153.0   2.12201   3.81136
## iinfct=both      341      139    139.5   0.00166   0.00278
## 
##  Chisq= 8.5  on 2 degrees of freedom, p= 0.01

Cumulative Hazards and CLogLog

Not surprisingly, we can see that the cumulative hazard grows much faster over time when Gonorrhea is the initial infection instead of Chlamydia and being initially infected with both.

## Call:
## coxph(formula = surv_object ~ iinfct, data = std)
## 
##   n= 877, number of events= 347 
## 
##                    coef exp(coef) se(coef)      z Pr(>|z|)   
## iinfctchlamydia -0.4202    0.6569   0.1457 -2.884  0.00393 **
## iinfctboth      -0.2980    0.7423   0.1450 -2.055  0.03984 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                 exp(coef) exp(-coef) lower .95 upper .95
## iinfctchlamydia    0.6569      1.522    0.4937    0.8741
## iinfctboth         0.7423      1.347    0.5587    0.9863
## 
## Concordance= 0.524  (se = 0.016 )
## Likelihood ratio test= 7.93  on 2 df,   p=0.02
## Wald test            = 8.37  on 2 df,   p=0.02
## Score (logrank) test = 8.46  on 2 df,   p=0.01
##        chisq df    p
## iinfct  2.88  2 0.24
## GLOBAL  2.88  2 0.24

Building the Model

cox1 <-
  coxph(
    surv_object ~ iinfct + marital + race + os12m + os30d +
      rs12m + rs30d + abdpain + discharge + dysuria + condom + 
      itch + lesion + rash + lymph + vagina + dchexam + abnode +
      age + yschool + npartner,
    data = std
  )
summary(cox1)
## Call:
## coxph(formula = surv_object ~ iinfct + marital + race + os12m + 
##     os30d + rs12m + rs30d + abdpain + discharge + dysuria + condom + 
##     itch + lesion + rash + lymph + vagina + dchexam + abnode + 
##     age + yschool + npartner, data = std)
## 
##   n= 877, number of events= 347 
## 
##                      coef exp(coef)  se(coef)      z Pr(>|z|)   
## iinfctchlamydia -0.334628  0.715604  0.149647 -2.236  0.02534 * 
## iinfctboth      -0.267515  0.765279  0.149987 -1.784  0.07449 . 
## maritalMarried   0.055058  1.056602  0.431303  0.128  0.89842   
## maritalSingle    0.408097  1.503953  0.295341  1.382  0.16704   
## raceWhite       -0.111462  0.894526  0.141327 -0.789  0.43030   
## os12m1          -0.206151  0.813711  0.212028 -0.972  0.33091   
## os30d1          -0.339983  0.711783  0.238657 -1.425  0.15428   
## rs12m1           0.033955  1.034538  0.445166  0.076  0.93920   
## rs30d1          -0.194771  0.823023  0.565199 -0.345  0.73039   
## abdpain1         0.229308  1.257729  0.156236  1.468  0.14219   
## discharge1       0.114691  1.121527  0.114283  1.004  0.31559   
## dysuria1         0.164089  1.178320  0.155157  1.058  0.29025   
## condomsometime  -0.064082  0.937928  0.239642 -0.267  0.78916   
## condomnever     -0.321968  0.724721  0.247790 -1.299  0.19382   
## itch1           -0.147451  0.862905  0.154774 -0.953  0.34075   
## lesion1         -0.183670  0.832210  0.333523 -0.551  0.58184   
## rash1            0.008298  1.008333  0.392928  0.021  0.98315   
## lymph1          -0.030840  0.969631  0.547496 -0.056  0.95508   
## vagina1          0.351398  1.421053  0.174868  2.010  0.04448 * 
## dchexam1        -0.463338  0.629180  0.230020 -2.014  0.04397 * 
## abnode1          0.170712  1.186149  0.433788  0.394  0.69392   
## age              0.008096  1.008129  0.013913  0.582  0.56066   
## yschool         -0.128072  0.879790  0.039366 -3.253  0.00114 **
## npartner         0.076670  1.079686  0.053888  1.423  0.15480   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                 exp(coef) exp(-coef) lower .95 upper .95
## iinfctchlamydia    0.7156     1.3974    0.5337    0.9595
## iinfctboth         0.7653     1.3067    0.5704    1.0268
## maritalMarried     1.0566     0.9464    0.4537    2.4606
## maritalSingle      1.5040     0.6649    0.8430    2.6830
## raceWhite          0.8945     1.1179    0.6781    1.1800
## os12m1             0.8137     1.2289    0.5370    1.2330
## os30d1             0.7118     1.4049    0.4459    1.1363
## rs12m1             1.0345     0.9666    0.4323    2.4756
## rs30d1             0.8230     1.2150    0.2718    2.4918
## abdpain1           1.2577     0.7951    0.9260    1.7083
## discharge1         1.1215     0.8916    0.8965    1.4031
## dysuria1           1.1783     0.8487    0.8693    1.5971
## condomsometime     0.9379     1.0662    0.5864    1.5002
## condomnever        0.7247     1.3798    0.4459    1.1778
## itch1              0.8629     1.1589    0.6371    1.1687
## lesion1            0.8322     1.2016    0.4328    1.6000
## rash1              1.0083     0.9917    0.4668    2.1780
## lymph1             0.9696     1.0313    0.3316    2.8355
## vagina1            1.4211     0.7037    1.0087    2.0020
## dchexam1           0.6292     1.5894    0.4008    0.9876
## abnode1            1.1861     0.8431    0.5069    2.7758
## age                1.0081     0.9919    0.9810    1.0360
## yschool            0.8798     1.1366    0.8145    0.9504
## npartner           1.0797     0.9262    0.9715    1.2000
## 
## Concordance= 0.635  (se = 0.016 )
## Likelihood ratio test= 73.37  on 24 df,   p=7e-07
## Wald test            = 69.89  on 24 df,   p=2e-06
## Score (logrank) test = 71.71  on 24 df,   p=1e-06
## Single term deletions
## 
## Model:
## surv_object ~ iinfct + marital + race + os12m + os30d + rs12m + 
##     rs30d + abdpain + discharge + dysuria + condom + itch + lesion + 
##     rash + lymph + vagina + dchexam + abnode + age + yschool + 
##     npartner
##           Df    AIC     LRT Pr(>Chi)   
## <none>       4121.2                    
## iinfct     2 4122.2  4.9877 0.082590 . 
## marital    2 4119.8  2.6535 0.265345   
## race       1 4119.8  0.6300 0.427371   
## os12m      1 4120.2  0.9869 0.320512   
## os30d      1 4121.1  1.9585 0.161675   
## rs12m      1 4119.2  0.0058 0.939450   
## rs30d      1 4119.3  0.1175 0.731758   
## abdpain    1 4121.2  2.0656 0.150654   
## discharge  1 4120.2  1.0055 0.315972   
## dysuria    1 4120.2  1.0820 0.298256   
## condom     2 4122.2  5.0381 0.080535 . 
## itch       1 4120.1  0.9324 0.334250   
## lesion     1 4119.5  0.3156 0.574248   
## rash       1 4119.2  0.0004 0.983170   
## lymph      1 4119.2  0.0032 0.954914   
## vagina     1 4122.9  3.7409 0.053095 . 
## dchexam    1 4122.8  3.6128 0.057335 . 
## abnode     1 4119.3  0.1482 0.700309   
## age        1 4119.5  0.3321 0.564436   
## yschool    1 4129.7 10.5087 0.001188 **
## npartner   1 4121.0  1.8274 0.176434   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Call:
## coxph(formula = surv_object ~ iinfct + condom + vagina + dchexam + 
##     yschool, data = std)
## 
##   n= 877, number of events= 347 
## 
##                     coef exp(coef) se(coef)      z Pr(>|z|)    
## iinfctchlamydia -0.38686   0.67919  0.14645 -2.642  0.00825 ** 
## iinfctboth      -0.24904   0.77955  0.14590 -1.707  0.08783 .  
## condomsometime  -0.05806   0.94360  0.23080 -0.252  0.80139    
## condomnever     -0.35041   0.70440  0.23845 -1.469  0.14170    
## vagina1          0.40902   1.50534  0.16768  2.439  0.01472 *  
## dchexam1        -0.36961   0.69100  0.22127 -1.670  0.09484 .  
## yschool         -0.14396   0.86592  0.03337 -4.314  1.6e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                 exp(coef) exp(-coef) lower .95 upper .95
## iinfctchlamydia    0.6792     1.4723    0.5097    0.9050
## iinfctboth         0.7795     1.2828    0.5857    1.0376
## condomsometime     0.9436     1.0598    0.6002    1.4834
## condomnever        0.7044     1.4196    0.4414    1.1241
## vagina1            1.5053     0.6643    1.0837    2.0911
## dchexam1           0.6910     1.4472    0.4478    1.0662
## yschool            0.8659     1.1548    0.8111    0.9245
## 
## Concordance= 0.605  (se = 0.017 )
## Likelihood ratio test= 45.38  on 7 df,   p=1e-07
## Wald test            = 46.55  on 7 df,   p=7e-08
## Score (logrank) test = 46.79  on 7 df,   p=6e-08
## Call:
## coxph(formula = surv_object ~ iinfct + vagina + dchexam + yschool, 
##     data = std)
## 
##   n= 877, number of events= 347 
## 
##                     coef exp(coef) se(coef)      z Pr(>|z|)    
## iinfctchlamydia -0.36903   0.69140  0.14600 -2.528   0.0115 *  
## iinfctboth      -0.24433   0.78323  0.14541 -1.680   0.0929 .  
## vagina1          0.41548   1.51510  0.16745  2.481   0.0131 *  
## dchexam1        -0.36938   0.69116  0.22124 -1.670   0.0950 .  
## yschool         -0.15352   0.85768  0.03338 -4.599 4.24e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                 exp(coef) exp(-coef) lower .95 upper .95
## iinfctchlamydia    0.6914      1.446    0.5193    0.9205
## iinfctboth         0.7832      1.277    0.5890    1.0415
## vagina1            1.5151      0.660    1.0912    2.1037
## dchexam1           0.6912      1.447    0.4480    1.0664
## yschool            0.8577      1.166    0.8034    0.9157
## 
## Concordance= 0.592  (se = 0.017 )
## Likelihood ratio test= 38.48  on 5 df,   p=3e-07
## Wald test            = 39.42  on 5 df,   p=2e-07
## Score (logrank) test = 39.36  on 5 df,   p=2e-07
## [1] 4115.147
## [1] 4118.048
## Call:
## coxph(formula = surv_object ~ iinfct + condom + vagina + dchexam + 
##     yschool, data = std)
## 
##   n= 877, number of events= 347 
## 
##                     coef exp(coef) se(coef)      z Pr(>|z|)    
## iinfctchlamydia -0.38468   0.68067  0.14618 -2.632  0.00850 ** 
## iinfctboth      -0.24681   0.78129  0.14561 -1.695  0.09006 .  
## condomnever     -0.29779   0.74245  0.11551 -2.578  0.00994 ** 
## vagina1          0.40660   1.50171  0.16739  2.429  0.01514 *  
## dchexam1        -0.37042   0.69044  0.22123 -1.674  0.09405 .  
## yschool         -0.14357   0.86626  0.03332 -4.308 1.64e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                 exp(coef) exp(-coef) lower .95 upper .95
## iinfctchlamydia    0.6807     1.4692    0.5111    0.9065
## iinfctboth         0.7813     1.2799    0.5873    1.0393
## condomnever        0.7425     1.3469    0.5920    0.9311
## vagina1            1.5017     0.6659    1.0817    2.0848
## dchexam1           0.6904     1.4483    0.4475    1.0652
## yschool            0.8663     1.1544    0.8115    0.9247
## 
## Concordance= 0.603  (se = 0.017 )
## Likelihood ratio test= 45.32  on 6 df,   p=4e-08
## Wald test            = 46.51  on 6 df,   p=2e-08
## Score (logrank) test = 46.77  on 6 df,   p=2e-08
## Single term deletions
## 
## Model:
## surv_object ~ iinfct + condom + vagina + dchexam + yschool
##         Df    AIC
## <none>     4113.2
## iinfct   2 4115.9
## condom   1 4118.0
## vagina   1 4116.5
## dchexam  1 4113.7
## yschool  1 4129.9
## [1] 4113.209
cox.model = coxph(surv_object ~ iinfct+condom+vagina
                  +dchexam+yschool, data = std)
summary(cox.model)
## Call:
## coxph(formula = surv_object ~ iinfct + condom + vagina + dchexam + 
##     yschool, data = std)
## 
##   n= 877, number of events= 347 
## 
##                     coef exp(coef) se(coef)      z Pr(>|z|)    
## iinfctchlamydia -0.38468   0.68067  0.14618 -2.632  0.00850 ** 
## iinfctboth      -0.24681   0.78129  0.14561 -1.695  0.09006 .  
## condomnever     -0.29779   0.74245  0.11551 -2.578  0.00994 ** 
## vagina1          0.40660   1.50171  0.16739  2.429  0.01514 *  
## dchexam1        -0.37042   0.69044  0.22123 -1.674  0.09405 .  
## yschool         -0.14357   0.86626  0.03332 -4.308 1.64e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                 exp(coef) exp(-coef) lower .95 upper .95
## iinfctchlamydia    0.6807     1.4692    0.5111    0.9065
## iinfctboth         0.7813     1.2799    0.5873    1.0393
## condomnever        0.7425     1.3469    0.5920    0.9311
## vagina1            1.5017     0.6659    1.0817    2.0848
## dchexam1           0.6904     1.4483    0.4475    1.0652
## yschool            0.8663     1.1544    0.8115    0.9247
## 
## Concordance= 0.603  (se = 0.017 )
## Likelihood ratio test= 45.32  on 6 df,   p=4e-08
## Wald test            = 46.51  on 6 df,   p=2e-08
## Score (logrank) test = 46.77  on 6 df,   p=2e-08

Checking the Model

cox.modelNoSchool <- coxph(surv_object ~ iinfct+condom+vagina+os30d+abdpain, data = std)
martNoSchool <- residuals(cox.modelNoSchool, type = "martingale")

# We plot martingale residuals to see if transformation is appropriate 
lowessOBJ <- as.data.frame(lowess(std$yschool, martNoSchool))

ggplotly(
  ggplot() + aes(std$yschool, martNoSchool) + geom_point() + 
    labs(x = "Years of School", y = "Martingale Residuals", title = "Martingale Residuals vs. Years of School") + 
    geom_line(data = lowessOBJ, aes(x = x, y = y), col = "#6495ed")
)
##          chisq df    p
## iinfct  3.5184  2 0.17
## condom  1.0539  1 0.30
## vagina  1.2835  1 0.26
## dchexam 0.4085  1 0.52
## yschool 0.0214  1 0.88
## GLOBAL  6.1176  6 0.41

Outliers

##     obs  race            marital age yschool    iinfct npartner os12m os30d
## 11   11 Black Divorced/Separated  32      12      both        6     1     1
## 366 366 White             Single  14       9 gonorrhea        1     0     0
## 525 525 Black             Single  15       8      both        1     1     1
## 831 831 White             Single  20      12 chlamydia       10     1     1
##     rs12m rs30d abdpain discharge dysuria condom itch lesion rash lymph vagina
## 11      0     0       1         1       0    use    0      0    0     0      1
## 366     0     0       0         0       1    use    0      0    0     0      0
## 525     0     0       0         1       0  never    0      0    0     0      1
## 831     1     1       0         1       0    use    1      0    0     0      1
##     dchexam abnode rinfct time
## 11        1      0      0 1468
## 366       1      0      0 1439
## 525       1      0      0 1005
## 831       0      0      0 1027